import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
df = pd.read_csv('beer_profile_and_ratings.csv')
df.head()
| Name | Style | Brewery | Beer Name (Full) | Description | ABV | Min IBU | Max IBU | Astringency | Body | ... | Fruits | Hoppy | Spices | Malty | review_aroma | review_appearance | review_palate | review_taste | review_overall | number_of_reviews | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Amber | Altbier | Alaskan Brewing Co. | Alaskan Brewing Co. Alaskan Amber | Notes:Richly malty and long on the palate, wit... | 5.3 | 25 | 50 | 13 | 32 | ... | 33 | 57 | 8 | 111 | 3.498994 | 3.636821 | 3.556338 | 3.643863 | 3.847082 | 497 |
| 1 | Double Bag | Altbier | Long Trail Brewing Co. | Long Trail Brewing Co. Double Bag | Notes:This malty, full-bodied double alt is al... | 7.2 | 25 | 50 | 12 | 57 | ... | 24 | 35 | 12 | 84 | 3.798337 | 3.846154 | 3.904366 | 4.024948 | 4.034304 | 481 |
| 2 | Long Trail Ale | Altbier | Long Trail Brewing Co. | Long Trail Brewing Co. Long Trail Ale | Notes:Long Trail Ale is a full-bodied amber al... | 5.0 | 25 | 50 | 14 | 37 | ... | 10 | 54 | 4 | 62 | 3.409814 | 3.667109 | 3.600796 | 3.631300 | 3.830239 | 377 |
| 3 | Doppelsticke | Altbier | Uerige Obergärige Hausbrauerei GmbH / Zum Uerige | Uerige Obergärige Hausbrauerei GmbH / Zum Ueri... | Notes: | 8.5 | 25 | 50 | 13 | 55 | ... | 49 | 40 | 16 | 119 | 4.148098 | 4.033967 | 4.150815 | 4.205163 | 4.005435 | 368 |
| 4 | Sleigh'r Dark Doüble Alt Ale | Altbier | Ninkasi Brewing Company | Ninkasi Brewing Company Sleigh'r Dark Doüble A... | Notes:Called 'Dark Double Alt' on the label.Se... | 7.2 | 25 | 50 | 25 | 51 | ... | 11 | 51 | 20 | 95 | 3.625000 | 3.973958 | 3.734375 | 3.765625 | 3.817708 | 96 |
5 rows × 25 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3197 entries, 0 to 3196 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 3197 non-null object 1 Style 3197 non-null object 2 Brewery 3197 non-null object 3 Beer Name (Full) 3197 non-null object 4 Description 3197 non-null object 5 ABV 3197 non-null float64 6 Min IBU 3197 non-null int64 7 Max IBU 3197 non-null int64 8 Astringency 3197 non-null int64 9 Body 3197 non-null int64 10 Alcohol 3197 non-null int64 11 Bitter 3197 non-null int64 12 Sweet 3197 non-null int64 13 Sour 3197 non-null int64 14 Salty 3197 non-null int64 15 Fruits 3197 non-null int64 16 Hoppy 3197 non-null int64 17 Spices 3197 non-null int64 18 Malty 3197 non-null int64 19 review_aroma 3197 non-null float64 20 review_appearance 3197 non-null float64 21 review_palate 3197 non-null float64 22 review_taste 3197 non-null float64 23 review_overall 3197 non-null float64 24 number_of_reviews 3197 non-null int64 dtypes: float64(6), int64(14), object(5) memory usage: 624.5+ KB
df.describe()
| ABV | Min IBU | Max IBU | Astringency | Body | Alcohol | Bitter | Sweet | Sour | Salty | Fruits | Hoppy | Spices | Malty | review_aroma | review_appearance | review_palate | review_taste | review_overall | number_of_reviews | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 | 3197.000000 |
| mean | 6.526688 | 21.180482 | 38.986863 | 16.515796 | 46.129496 | 17.055990 | 36.364404 | 58.270879 | 33.145449 | 1.017204 | 38.529559 | 40.924617 | 18.345637 | 75.330935 | 3.638789 | 3.754393 | 3.660428 | 3.702496 | 3.747522 | 233.284955 |
| std | 2.546997 | 13.242242 | 21.355281 | 10.410661 | 25.947842 | 17.331334 | 25.791152 | 34.281310 | 35.780172 | 2.132651 | 32.296646 | 30.403641 | 23.756582 | 39.909338 | 0.503209 | 0.403416 | 0.449937 | 0.510361 | 0.444288 | 361.811847 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.509615 | 1.571429 | 1.285714 | 1.214286 | 1.136364 | 1.000000 |
| 25% | 5.000000 | 15.000000 | 25.000000 | 9.000000 | 29.000000 | 6.000000 | 17.000000 | 33.000000 | 11.000000 | 0.000000 | 12.000000 | 18.000000 | 4.000000 | 45.000000 | 3.422559 | 3.604651 | 3.470021 | 3.500000 | 3.566667 | 23.000000 |
| 50% | 6.000000 | 20.000000 | 35.000000 | 14.000000 | 40.000000 | 11.000000 | 31.000000 | 54.000000 | 22.000000 | 0.000000 | 29.000000 | 33.000000 | 10.000000 | 73.000000 | 3.720183 | 3.833333 | 3.741667 | 3.791667 | 3.830239 | 93.000000 |
| 75% | 7.600000 | 25.000000 | 45.000000 | 21.000000 | 58.000000 | 22.000000 | 52.000000 | 77.000000 | 42.000000 | 1.000000 | 60.000000 | 56.000000 | 23.000000 | 103.000000 | 3.978000 | 4.000000 | 3.965587 | 4.033333 | 4.032847 | 284.000000 |
| max | 57.500000 | 65.000000 | 100.000000 | 81.000000 | 175.000000 | 139.000000 | 150.000000 | 263.000000 | 284.000000 | 48.000000 | 175.000000 | 172.000000 | 184.000000 | 239.000000 | 5.000000 | 4.666667 | 5.000000 | 5.000000 | 5.000000 | 3290.000000 |
## Assess correlations visually:
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x7fa7ebaa06a0>
# I think all of the review types are averaged into 'review_overall' rating, so I tried to spot other (not as intuitive) correlations.
df.plot.scatter(x='Body',y='Malty')
df.plot.scatter(x='Bitter', y='Hoppy')
df.plot.scatter(x='Sour', y='Astringency')
<AxesSubplot:xlabel='Sour', ylabel='Astringency'>
##Check for correlations numerically:
df.corr()
| ABV | Min IBU | Max IBU | Astringency | Body | Alcohol | Bitter | Sweet | Sour | Salty | Fruits | Hoppy | Spices | Malty | review_aroma | review_appearance | review_palate | review_taste | review_overall | number_of_reviews | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ABV | 1.000000 | 0.432005 | 0.501037 | -0.169521 | 0.241673 | 0.654908 | 0.067388 | 0.463487 | 0.100795 | -0.120089 | 0.291001 | -0.052596 | 0.191468 | 0.162060 | 0.416197 | 0.349139 | 0.392517 | 0.381200 | 0.251838 | 0.142176 |
| Min IBU | 0.432005 | 1.000000 | 0.854248 | -0.071501 | 0.325338 | 0.323694 | 0.539452 | 0.227139 | -0.073098 | -0.057512 | 0.066335 | 0.407475 | -0.046152 | 0.300041 | 0.343730 | 0.380908 | 0.352723 | 0.339885 | 0.289972 | 0.219822 |
| Max IBU | 0.501037 | 0.854248 | 1.000000 | -0.120273 | 0.310617 | 0.392811 | 0.478080 | 0.277292 | -0.043275 | -0.083214 | 0.172929 | 0.345168 | 0.044533 | 0.288219 | 0.369456 | 0.379655 | 0.355608 | 0.346557 | 0.271969 | 0.221572 |
| Astringency | -0.169521 | -0.071501 | -0.120273 | 1.000000 | -0.059540 | -0.171987 | 0.114686 | -0.021456 | 0.571030 | 0.347155 | 0.345232 | 0.330951 | -0.083795 | -0.082085 | 0.068579 | 0.070948 | 0.106950 | 0.092021 | 0.159788 | -0.050062 |
| Body | 0.241673 | 0.325338 | 0.310617 | -0.059540 | 1.000000 | 0.268885 | 0.542236 | 0.458842 | -0.126733 | -0.099277 | -0.048155 | 0.070138 | 0.185123 | 0.754228 | 0.360234 | 0.421331 | 0.345270 | 0.365055 | 0.313824 | 0.039057 |
| Alcohol | 0.654908 | 0.323694 | 0.392811 | -0.171987 | 0.268885 | 1.000000 | 0.009088 | 0.527039 | 0.048767 | -0.094329 | 0.254299 | -0.079949 | 0.252876 | 0.270106 | 0.242989 | 0.184693 | 0.205569 | 0.206701 | 0.069789 | 0.048346 |
| Bitter | 0.067388 | 0.539452 | 0.478080 | 0.114686 | 0.542236 | 0.009088 | 1.000000 | 0.091705 | -0.136914 | 0.004693 | -0.093450 | 0.712887 | -0.084048 | 0.565570 | 0.241457 | 0.331854 | 0.257267 | 0.253373 | 0.260874 | 0.112026 |
| Sweet | 0.463487 | 0.227139 | 0.277292 | -0.021456 | 0.458842 | 0.527039 | 0.091705 | 1.000000 | 0.257913 | -0.131918 | 0.482030 | -0.034327 | 0.107548 | 0.471032 | 0.356514 | 0.332311 | 0.315767 | 0.322277 | 0.207293 | 0.048487 |
| Sour | 0.100795 | -0.073098 | -0.043275 | 0.571030 | -0.126733 | 0.048767 | -0.136914 | 0.257913 | 1.000000 | 0.098173 | 0.785883 | 0.068895 | 0.001831 | -0.303266 | 0.289320 | 0.204817 | 0.245169 | 0.244314 | 0.213449 | 0.006642 |
| Salty | -0.120089 | -0.057512 | -0.083214 | 0.347155 | -0.099277 | -0.094329 | 0.004693 | -0.131918 | 0.098173 | 1.000000 | 0.026920 | 0.172606 | -0.023079 | -0.028241 | -0.088210 | -0.081388 | -0.038413 | -0.057479 | -0.008199 | -0.029680 |
| Fruits | 0.291001 | 0.066335 | 0.172929 | 0.345232 | -0.048155 | 0.254299 | -0.093450 | 0.482030 | 0.785883 | 0.026920 | 1.000000 | 0.110407 | 0.148281 | -0.196890 | 0.386676 | 0.288594 | 0.324906 | 0.319690 | 0.261504 | 0.106223 |
| Hoppy | -0.052596 | 0.407475 | 0.345168 | 0.330951 | 0.070138 | -0.079949 | 0.712887 | -0.034327 | 0.068895 | 0.172606 | 0.110407 | 1.000000 | -0.131964 | 0.195767 | 0.099832 | 0.180224 | 0.159859 | 0.125560 | 0.175071 | 0.093800 |
| Spices | 0.191468 | -0.046152 | 0.044533 | -0.083795 | 0.185123 | 0.252876 | -0.084048 | 0.107548 | 0.001831 | -0.023079 | 0.148281 | -0.131964 | 1.000000 | 0.061399 | 0.272312 | 0.196014 | 0.199024 | 0.217365 | 0.141125 | -0.014847 |
| Malty | 0.162060 | 0.300041 | 0.288219 | -0.082085 | 0.754228 | 0.270106 | 0.565570 | 0.471032 | -0.303266 | -0.028241 | -0.196890 | 0.195767 | 0.061399 | 1.000000 | 0.205665 | 0.288089 | 0.231032 | 0.236624 | 0.212990 | 0.006598 |
| review_aroma | 0.416197 | 0.343730 | 0.369456 | 0.068579 | 0.360234 | 0.242989 | 0.241457 | 0.356514 | 0.289320 | -0.088210 | 0.386676 | 0.099832 | 0.272312 | 0.205665 | 1.000000 | 0.855629 | 0.907433 | 0.936670 | 0.870504 | 0.174033 |
| review_appearance | 0.349139 | 0.380908 | 0.379655 | 0.070948 | 0.421331 | 0.184693 | 0.331854 | 0.332311 | 0.204817 | -0.081388 | 0.288594 | 0.180224 | 0.196014 | 0.288089 | 0.855629 | 1.000000 | 0.869978 | 0.850693 | 0.813444 | 0.191213 |
| review_palate | 0.392517 | 0.352723 | 0.355608 | 0.106950 | 0.345270 | 0.205569 | 0.257267 | 0.315767 | 0.245169 | -0.038413 | 0.324906 | 0.159859 | 0.199024 | 0.231032 | 0.907433 | 0.869978 | 1.000000 | 0.946520 | 0.918154 | 0.185631 |
| review_taste | 0.381200 | 0.339885 | 0.346557 | 0.092021 | 0.365055 | 0.206701 | 0.253373 | 0.322277 | 0.244314 | -0.057479 | 0.319690 | 0.125560 | 0.217365 | 0.236624 | 0.936670 | 0.850693 | 0.946520 | 1.000000 | 0.937102 | 0.190588 |
| review_overall | 0.251838 | 0.289972 | 0.271969 | 0.159788 | 0.313824 | 0.069789 | 0.260874 | 0.207293 | 0.213449 | -0.008199 | 0.261504 | 0.175071 | 0.141125 | 0.212990 | 0.870504 | 0.813444 | 0.918154 | 0.937102 | 1.000000 | 0.176511 |
| number_of_reviews | 0.142176 | 0.219822 | 0.221572 | -0.050062 | 0.039057 | 0.048346 | 0.112026 | 0.048487 | 0.006642 | -0.029680 | 0.106223 | 0.093800 | -0.014847 | 0.006598 | 0.174033 | 0.191213 | 0.185631 | 0.190588 | 0.176511 | 1.000000 |
target = 'Sour'
train_cols = [a for a in df.columns if a != target]
X = df[train_cols]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=42)
## Compare with baseline model, using just mean values:
baseline = y_train.mean()
y_pred_train = [baseline]*len(X_train)
y_pred_test = [baseline]*len(X_test)
# squared=False or else you will be getting squared values!
train_rmse = mean_squared_error(y_pred_train,y_train,squared=False)
test_rmse = mean_squared_error(y_pred_test,y_test,squared=False)
train_r2 = r2_score(y_pred_train,y_train)
test_r2 = r2_score(y_test,y_pred_test)
print('rmse train:',train_rmse,'rmse test:', test_rmse)
print("r2 train:", train_r2,'r2 test:', test_r2)
rmse train: 36.329438302026844 rmse test: 34.448542812443726 r2 train: -6.53547213717035e+30 r2 test: -0.00030434775618526366
So the r-squared to beat is.. actually pretty bad. Should be easy to improve on this model.
## I will try a linear regression using the 'Fruits' values to predict level of 'Sour':
reg = LinearRegression()
X_train_univariate = X_train['Fruits'].values.reshape(-1,1)
X_test_univariate = X_test['Fruits'].values.reshape(-1,1)
reg.fit(X_train_univariate, y_train)
print("regression parameters:", reg.coef_[0],reg.intercept_)
regression parameters: 0.8688020267269813 -0.142996627279544
train_rmse = mean_squared_error(reg.predict(X_train_univariate),y_train,squared=False)
test_rmse = mean_squared_error(reg.predict(X_test_univariate),y_test,squared=False)
train_r2 = r2_score(reg.predict(X_train_univariate),y_train)
test_r2 = r2_score(reg.predict(X_test_univariate),y_test)
print('rmse train:',train_rmse,'rmse test:', test_rmse)
print("r2 train:", train_r2,'r2 test:', test_r2)
rmse train: 22.677000979470822 rmse test: 20.7748016291049 r2 train: 0.36164592053369427 r2 test: 0.4198851938837118
The r-squared is positive! Okay, that is okay for only using one variable, and it is an improvement from the baseline.
I can improve on this model with more variables, but first let's see that trendline:
## Evaluating the trend line:
df.plot.scatter(x='Fruits',y='Sour',marker='o',c="g",alpha=0.5)
plt.scatter(x=df['Fruits'],y=reg.predict(df['Fruits'].values.reshape(-1,1)),marker='+',label='trend_line')
<matplotlib.collections.PathCollection at 0x7fa7e2b1ffd0>
Let's see if adding 'Astringency' can help predict level of 'Sour' (sourness):
reg_multiple = LinearRegression()
multiple_columns = ['Fruits','Astringency']
reg_multiple.fit(X_train[multiple_columns], y_train)
print("regression parameters:", reg.coef_,reg.intercept_)
regression parameters: [0.86880203] -0.142996627279544
## Check r-squared for this model:
train_rmse = mean_squared_error(reg_multiple.predict(X_train[multiple_columns]),y_train,squared=False)
test_rmse = mean_squared_error(reg_multiple.predict(X_test[multiple_columns]),y_test,squared=False)
train_r2 = r2_score(reg_multiple.predict(X_train[multiple_columns]),y_train)
test_r2 = r2_score(reg_multiple.predict(X_test[multiple_columns]),y_test)
print('rmse train:',train_rmse,'rmse test:', test_rmse)
print("r2 train:", train_r2,'r2 test:', test_r2)
rmse train: 19.393115002418288 rmse test: 17.88770762454073 r2 train: 0.6014846979040045 r2 test: 0.6571811036469168
Let's see if any of the other variables with less hints of correlation can improve the model:
## Can adding metric for sweetness improve the model?:
reg_multiple = LinearRegression()
multiple_columns = ['Fruits','Astringency','Sweet']
reg_multiple.fit(X_train[multiple_columns], y_train)
print("regression parameters:", reg.coef_,reg.intercept_)
regression parameters: [0.86880203] -0.142996627279544
## Check r-squared with sweetness added:
train_rmse = mean_squared_error(reg_multiple.predict(X_train[multiple_columns]),y_train,squared=False)
test_rmse = mean_squared_error(reg_multiple.predict(X_test[multiple_columns]),y_test,squared=False)
train_r2 = r2_score(reg_multiple.predict(X_train[multiple_columns]),y_train)
test_r2 = r2_score(reg_multiple.predict(X_test[multiple_columns]),y_test)
print('rmse train:',train_rmse,'rmse test:', test_rmse)
print("r2 train:", train_r2,'r2 test:', test_r2)
rmse train: 19.22033019599842 rmse test: 17.791411883218753 r2 train: 0.6113022158064989 r2 test: 0.6617225875257344
Not much of an imrovement there. Let's try adding 'Malty', Astringency', and 'review_appearance':
#Adding 'Astringency', 'Malty' and 'review_appearance':
reg_multiple = LinearRegression()
multiple_columns = ['Fruits','Astringency','Malty','review_appearance','Sweet']
reg_multiple.fit(X_train[multiple_columns], y_train)
print("regression parameters:", reg.coef_,reg.intercept_)
regression parameters: [0.86880203] -0.142996627279544
train_rmse = mean_squared_error(reg_multiple.predict(X_train[multiple_columns]),y_train,squared=False)
test_rmse = mean_squared_error(reg_multiple.predict(X_test[multiple_columns]),y_test,squared=False)
train_r2 = r2_score(reg_multiple.predict(X_train[multiple_columns]),y_train)
test_r2 = r2_score(reg_multiple.predict(X_test[multiple_columns]),y_test)
print('rmse train:',train_rmse,'rmse test:', test_rmse)
print("r2 train:", train_r2,'r2 test:', test_r2)
rmse train: 18.456619232390146 rmse test: 17.163116228854356 r2 train: 0.6521105993052694 r2 test: 0.6929781616244106